library(tictoc)
library(formattable)
library(data.table)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(GGally)
library(plotly)
library(gridExtra)
library(corrplot)
library(caret)
library(ggthemes)
library(RColorBrewer)
library(fmsb)
library(rpart.plot)
library(ROCR)
library(mlr3learners.gbm)
library(ranger)
library(mlr3)
library(mlr3learners)
library(mlr3measures)
library(mlr3pipelines)
library(mlr3tuning)
library(mlr3filters)
library(mlbench)
library(mlr3misc)
library(caret)
library(mlr3viz)
library(corrplot)
library(mlr3verse)
Original datasets from Kaggle: pokemon800.csv holds the information about each Pokemon; combats.csv holds the first Pokemon ID, the second Pokemon ID, and the winner ID.
# Read the raw Kaggle datasets (factor columns kept as factors).
pokemon <- read.csv("pokemon800.csv", header = TRUE, stringsAsFactors = TRUE)
combats <- read.csv("combats.csv", header = TRUE, stringsAsFactors = TRUE)
# Give the Pokemon stats short, consistent column names.
names(pokemon) <- c("id", "name", "type_1", "type_2", "hp", "attack", "defense",
                    "sp_attack", "sp_defense", "speed", "generation", "is_legendary")
# Ids are used as lookup keys, not numbers.
pokemon$id <- as.character(pokemon$id)
In order to predict the winning Pokemon, we need to build a new dataset that joins in the Pokemon information, since the "combats" dataset contains only Pokemon IDs.
daten <- combats

# Vectorized stat lookup by Pokemon id. match() is itself vectorized, so a
# single call replaces each of the original per-element sapply() loops with
# the same result and far less work.
stat_of <- function(ids, column) pokemon[[column]][match(ids, pokemon$id)]

# Resolve ids to names.
daten$First_pokemon_name  <- stat_of(daten$First_pokemon,  "name")
daten$Second_pokemon_name <- stat_of(daten$Second_pokemon, "name")
daten$Winner_name         <- stat_of(daten$Winner,         "name")
# Target: did the first listed Pokemon win?
daten$First_wins <- daten$First_pokemon == daten$Winner

# Attach the raw battle stats for both combatants.
daten$First_pokemon_attack   <- stat_of(daten$First_pokemon,  "attack")
daten$Second_pokemon_attack  <- stat_of(daten$Second_pokemon, "attack")
daten$First_pokemon_hp       <- stat_of(daten$First_pokemon,  "hp")
daten$Second_pokemon_hp      <- stat_of(daten$Second_pokemon, "hp")
daten$First_pokemon_defense  <- stat_of(daten$First_pokemon,  "defense")
daten$Second_pokemon_defense <- stat_of(daten$Second_pokemon, "defense")
daten$First_pokemon_sp_atk   <- stat_of(daten$First_pokemon,  "sp_attack")
daten$Second_pokemon_sp_atk  <- stat_of(daten$Second_pokemon, "sp_attack")
daten$First_pokemon_sp_def   <- stat_of(daten$First_pokemon,  "sp_defense")
daten$Second_pokemon_sp_def  <- stat_of(daten$Second_pokemon, "sp_defense")
daten$First_pokemon_speed    <- stat_of(daten$First_pokemon,  "speed")
daten$Second_pokemon_speed   <- stat_of(daten$Second_pokemon, "speed")

# Stat differences (first minus second) are the model features.
daten$attackVSattack_diff   <- daten$First_pokemon_attack  - daten$Second_pokemon_attack
daten$defenseVSdefense_diff <- daten$First_pokemon_defense - daten$Second_pokemon_defense
daten$sp_atkVSsp_atk_diff   <- daten$First_pokemon_sp_atk  - daten$Second_pokemon_sp_atk
daten$sp_defVSsp_def_diff   <- daten$First_pokemon_sp_def  - daten$Second_pokemon_sp_def
daten$speedVSspeed_diff     <- daten$First_pokemon_speed   - daten$Second_pokemon_speed
daten$HPVSHP_diff           <- daten$First_pokemon_hp      - daten$Second_pokemon_hp

# Is the first Pokemon faster? (-1 slower, 0 equal speed, 1 faster)
daten$First_pokemon_faster <- sign(daten$speedVSspeed_diff)

# Legendary status as logical flags.
# NOTE(review): as.logical() on a factor is applied to its levels, so this
# yields NA unless the levels spell TRUE/FALSE -- confirm against the CSV.
daten$First_pokemon_legendary  <- as.logical(stat_of(daten$First_pokemon,  "is_legendary"))
daten$Second_pokemon_legendary <- as.logical(stat_of(daten$Second_pokemon, "is_legendary"))
# Higher speed is definitely an advantage for a Pokemon in a battle, since it defines which Pokemon may act first.
# If the speeds of two fighting Pokemon are equal, the one that moves first is chosen randomly.
# This means positive values should have a positive impact on First_wins and negative values a negative impact. A value of zero, however, should have no impact, since the first mover is chosen completely at random.
# Here you can see that the speed difference has the highest influence on our model, but there are still no wins at all for the First_pokemon when the speed difference is zero.
# Contingency table: speed advantage (-1/0/1) vs. whether the first Pokemon won.
table(daten$First_pokemon_faster,daten$First_wins)
##
## FALSE TRUE
## -1 23218 1120
## 0 1328 0
## 1 1853 22481
# Since this is very unlikely to happen randomly, we assume that there is a data error for those specific observations
#exclude all of those observations with equal speed of both Pokemon
# Model the target as a factor for classification.
daten$First_wins <- as.factor(daten$First_wins)
# Drop the suspicious ties: equal speed never produced a first-Pokemon win.
daten <- daten[daten$First_pokemon_faster != 0, ]
# Turn every remaining logical column into a factor.
for (col in seq_len(ncol(daten))) {
  if (is.logical(daten[[col]])) {
    daten[[col]] <- as.factor(daten[[col]])
  }
}
Select the useful variables and create a preliminary task for the ML project. Task: binary classification. Target value to predict: does the first Pokemon win? (If not, the second Pokemon wins, of course :)) TRUE/FALSE
# Keep only the engineered difference features, the legendary flags and the target.
feature_cols <- c("attackVSattack_diff", "defenseVSdefense_diff",
                  "sp_atkVSsp_atk_diff", "sp_defVSsp_def_diff",
                  "speedVSspeed_diff", "HPVSHP_diff",
                  "First_pokemon_legendary", "Second_pokemon_legendary",
                  "First_wins")
Backend <- daten[, feature_cols]
# mlr3 classification task over the selected backend.
task_combat <- TaskClassif$new(id = "task_combat", backend = Backend, target = "First_wins")
- For the tree models: information gain decides which feature should be used to split the data.
# Rank features by information gain (the criterion tree models use for splits).
# NOTE(review): the name `filter` masks dplyr::filter/stats::filter from here on.
filter = flt("information_gain")
filter$calculate(task_combat)
# Render the scores as a color-tiled table.
formattable(as.data.table(filter), list(score = color_tile("transparent", "lightpink")))
| feature | score |
|---|---|
| speedVSspeed_diff | 0.471350986 |
| attackVSattack_diff | 0.073598085 |
| sp_atkVSsp_atk_diff | 0.068886552 |
| sp_defVSsp_def_diff | 0.033694959 |
| HPVSHP_diff | 0.031579065 |
| First_pokemon_legendary | 0.015948659 |
| Second_pokemon_legendary | 0.015053922 |
| defenseVSdefense_diff | 0.009023457 |
# Rebuild the task with only the best-scoring features from the
# information-gain ranking (legendary flags and defense difference dropped).
top_features <- c("speedVSspeed_diff", "attackVSattack_diff",
                  "sp_atkVSsp_atk_diff", "sp_defVSsp_def_diff",
                  "HPVSHP_diff", "First_wins")
Backend <- daten[, top_features]
task_combat <- TaskClassif$new(id = "task_combat", backend = Backend, target = "First_wins")
# All candidate learners predict class probabilities so they can be compared
# on the same footing.
make_prob_learner <- function(key) mlr3::lrn(key, predict_type = "prob")
learner_glmnet  <- make_prob_learner("classif.glmnet")
learner_log     <- make_prob_learner("classif.log_reg")
learner_nb      <- make_prob_learner("classif.naive_bayes")
learner_kknn    <- make_prob_learner("classif.kknn")
learner_RF      <- make_prob_learner("classif.ranger")
learner_xgboost <- make_prob_learner("classif.xgboost")
# Treatment-encode factor columns so every learner sees numeric features.
fencoder <- po("encode", method = "treatment",
               affect_columns = selector_type("factor"))
fencoder$train(list(task_combat))
## $output
## <TaskClassif:task_combat> (48672 x 6)
## * Target: First_wins
## * Properties: twoclass
## * Features (5):
## - int (5): HPVSHP_diff, attackVSattack_diff, sp_atkVSsp_atk_diff, sp_defVSsp_def_diff,
## speedVSspeed_diff
# Prepend the factor encoder to each learner so the encoding is part of the
# fitted pipeline (no shared temporary needed).
learner_glmnet  <- GraphLearner$new(fencoder %>>% learner_glmnet)
learner_log     <- GraphLearner$new(fencoder %>>% learner_log)
learner_nb      <- GraphLearner$new(fencoder %>>% learner_nb)
learner_kknn    <- GraphLearner$new(fencoder %>>% learner_kknn)
learner_RF      <- GraphLearner$new(fencoder %>>% learner_RF)
learner_xgboost <- GraphLearner$new(fencoder %>>% learner_xgboost)
# 80/20 train/test split on row ids, reproducible via the seed.
set.seed(1234)
train_set <- sample(task_combat$nrow, 0.8 * task_combat$nrow)
test_set <- setdiff(seq_len(task_combat$nrow), train_set)
# Fit every untuned learner on the same training rows. GraphLearners are R6
# reference objects, so training inside the loop mutates them in place.
set.seed(1234)
for (lrnr in list(learner_glmnet, learner_log, learner_nb,
                  learner_kknn, learner_RF, learner_xgboost)) {
  lrnr$train(task_combat, row_ids = train_set)
}
## [1] "glmnet prediction performances :"
## classif.ce classif.acc
## 0.90652286 0.09347714
## [1] "logreg prediction performances :"
## classif.ce classif.acc
## 0.09912686 0.90087314
## [1] "naive bayes prediction performances :"
## classif.ce classif.acc
## 0.1633282 0.8366718
## [1] "kknn prediction performances :"
## classif.ce classif.acc
## 0.07693888 0.92306112
## [1] "ragner(RF) prediction performances :"
## classif.ce classif.acc
## 0.05649718 0.94350282
## [1] "xgboost prediction performances :"
## classif.ce classif.acc
## 0.05721623 0.94278377
# Shared tuning setup used by every AutoTuner below:
# budget of 100 evaluations, grid search, 10-fold inner CV,
# classification error + accuracy as measures.
terminator <- term("evals", n_evals = 100)
tuner <- tnr("grid_search")
resample_inner <- rsmp("cv", folds = 10)
measures <- msrs(c("classif.ce", "classif.acc"))
# glmnet fits a generalized linear model via penalized maximum likelihood. Gaussian is the default family, and glmnet provides various options to customise the fit; the three options tuned here are:
# alpha, the elastic-net mixing parameter with range [0, 1]: alpha = 1 is the lasso (default) and alpha = 0 is the ridge.
# s, the value(s) of the penalty parameter lambda at which predictions are required; the default is the entire sequence used to create the model.
# eps, the minimum value of lambda.min.ratio; factory default = 1.0e-6.
# Search space for glmnet: alpha near the lasso end, a narrow band for the
# prediction penalty s, and the lambda.min.ratio floor eps.
tune_ps <- ParamSet$new(params = list(
  ParamDbl$new("classif.glmnet.alpha", lower = 0.8, upper = 0.9),
  ParamDbl$new("classif.glmnet.s", lower = 0.05, upper = 0.07),
  ParamDbl$new("classif.glmnet.eps", lower = 4e-10, upper = 5e-05)
))
# AutoTuner performs the inner-CV grid search and refits on the full training set.
learner_tunedglmnet <- AutoTuner$new(
  learner = learner_glmnet,
  resampling = resample_inner,
  measures = measures,
  tune_ps = tune_ps,
  terminator = terminator,
  tuner = tuner
)
set.seed(1234)
tic("start training")
learner_tunedglmnet$train(task_combat, row_ids = train_set)
toc()
# Search space for logistic regression: maximum IWLS iterations and the
# convergence tolerance of the glm fit.
tune_ps <- ParamSet$new(params = list(
  ParamInt$new("classif.log_reg.maxit", lower = 10, upper = 50),
  ParamDbl$new("classif.log_reg.epsilon", lower = 1e-09, upper = 1.1e-05)
))
learner_tunedlog <- AutoTuner$new(
  learner = learner_log,
  resampling = resample_inner,
  measures = measures,
  tune_ps = tune_ps,
  terminator = terminator,
  tuner = tuner
)
set.seed(1234)
tic("start training")
learner_tunedlog$train(task_combat, row_ids = train_set)
toc()
# threshold is the value by which zero probabilities, or probabilities within the epsilon range, corresponding to metric variables are replaced.
# Laplace (additive) smoothing handles categorical variables.
# eps specifies an epsilon range in which zero or close-to-zero probabilities are replaced by threshold; it applies to metric variables.
# Search space for naive Bayes: Laplace smoothing strength, the replacement
# threshold, and the epsilon range around zero probabilities.
tune_ps <- ParamSet$new(params = list(
  ParamDbl$new("classif.naive_bayes.laplace", lower = 0, upper = 1),
  ParamDbl$new("classif.naive_bayes.threshold", lower = 0.04444, upper = 1),
  ParamDbl$new("classif.naive_bayes.eps", lower = 1e-11, upper = 1e-09)
))
learner_tunednb <- AutoTuner$new(
  learner = learner_nb,
  resampling = resample_inner,
  measures = measures,
  tune_ps = tune_ps,
  terminator = terminator,
  tuner = tuner
)
set.seed(1234)
tic("start training")
learner_tunednb$train(task_combat, row_ids = train_set)
toc()
# For KKNN the main aspects are:
# - the distance measure used to define similarity (respectively dissimilarity) between the Pokemon battles. mlr3-kknn supports the Minkowski distance with different parameters. We will consider distance parameters 1, 2 and 3.
# - Minkowski distance with parameter 1 is the Manhattan distance
# - Minkowski distance with parameter 2 is the Euclidean distance
# - Minkowski distance with parameter 3 is not so commonly used, but could also lead to better results in specific cases
## - the number of closest neighbors considered to predict an observation (k)
# - we have < 50000 observations in the data set and ~ 800 different Pokemon which could be used in a battle.
# - this means the probability for two specific Pokemon to be selected as competitors in a battle is ~ (1/800) * (1/800) * 2 = 0.000003125
# - this means we expect 0.000003125 * 50000 = 0.15625 battles between each pair of Pokemon. Since Pokemon fights could be estimated best by looking at the same competitors' results in the past, we expect a rather low k to be the best parameter. Since this assumption is based on intuition rather than scientific proof, we still choose a rather high upper limit for k (200).
# Search space for kknn: number of neighbours k and the Minkowski distance
# parameter (1 = Manhattan, 2 = Euclidean, 3 less common).
tune_ps <- ParamSet$new(params = list(
  ParamInt$new("classif.kknn.k", lower = 1, upper = 200),
  ParamInt$new("classif.kknn.distance", lower = 1, upper = 3)
))
learner_tunedkknn <- AutoTuner$new(
  learner = learner_kknn,
  resampling = resample_inner,
  measures = measures,
  tune_ps = tune_ps,
  terminator = terminator,
  tuner = tuner
)
set.seed(1234)
tic("start training")
learner_tunedkknn$train(task_combat, row_ids = train_set)
toc()
# Search space for ranger (random forest):
# - num.trees: forest size.
# - min.node.size: minimal terminal node size.
# - mtry: features sampled per split, fixed at 1 (larger values increased the
#   error here, so the range is degenerate on purpose).
# importance, splitrule, num.random.splits and alpha were also tried but left
# at their defaults.
tune_ps <- ParamSet$new(params = list(
  ParamInt$new("classif.ranger.num.trees", lower = 10, upper = 300),
  ParamInt$new("classif.ranger.min.node.size", lower = 1, upper = 10),
  ParamInt$new("classif.ranger.mtry", lower = 1, upper = 1)
))
learner_tunedRF <- AutoTuner$new(
  learner = learner_RF,
  resampling = resample_inner,
  measures = measures,
  tune_ps = tune_ps,
  terminator = terminator,
  tuner = tuner
)
set.seed(1234)
tic("start training")
learner_tunedRF$train(task_combat, row_ids = train_set)
toc()
# Search space for xgboost:
# - eta (default 0.3): learning rate / shrinkage. Higher eta drops training
#   error faster; lower eta makes boosting more conservative.
# - gamma (default 0): minimum loss reduction required to split a leaf; larger
#   values are more conservative. 0 performed best here, so the range is fixed.
# - max_depth (default 6): maximum tree depth; deeper trees are more complex,
#   more overfit-prone, and memory-hungry.
# - min_child_weight (default 1): minimum hessian sum in a child node; larger
#   values are more conservative. Fixed at 0.01 after experimentation.
# - nrounds: number of boosting rounds; more rounds lower training error at the
#   cost of runtime.
tune_ps <- ParamSet$new(params = list(
  ParamDbl$new("classif.xgboost.eta", lower = 0.3, upper = 0.5),
  ParamDbl$new("classif.xgboost.gamma", lower = 0, upper = 0),
  ParamInt$new("classif.xgboost.max_depth", lower = 15, upper = 20),
  ParamDbl$new("classif.xgboost.min_child_weight", lower = 0.01, upper = 0.01),
  ParamInt$new("classif.xgboost.nrounds", lower = 30, upper = 100)
))
learner_tunedxgboost <- AutoTuner$new(
  learner = learner_xgboost,
  resampling = resample_inner,
  measures = measures,
  tune_ps = tune_ps,
  terminator = terminator,
  tuner = tuner
)
set.seed(1234)
tic("start training")
learner_tunedxgboost$train(task_combat, row_ids = train_set)
toc()
# glmnet: hyperparameters selected by the AutoTuner (via the inner 10-fold CV).
learner_tunedglmnet$tuning_result
## $tune_x
## $tune_x$classif.glmnet.alpha
## [1] 0.8
##
## $tune_x$classif.glmnet.s
## [1] 0.05
##
## $tune_x$classif.glmnet.eps
## [1] 5.555911e-06
##
##
## $params
## $params$encode.method
## [1] "treatment"
##
## $params$encode.affect_columns
## selector_type("factor")
##
## $params$classif.glmnet.alpha
## [1] 0.8
##
## $params$classif.glmnet.s
## [1] 0.05
##
## $params$classif.glmnet.eps
## [1] 5.555911e-06
##
##
## $perf
## classif.ce classif.acc
## 0.91337259 0.08662741
# Score the tuned glmnet on the held-out test rows.
tunedglmnet_prediction = learner_tunedglmnet$predict(task_combat, row_ids = test_set)
# log_reg: hyperparameters selected by the AutoTuner.
learner_tunedlog$tuning_result
## $tune_x
## $tune_x$classif.log_reg.maxit
## [1] 19
##
## $tune_x$classif.log_reg.epsilon
## [1] 1.1e-05
##
##
## $params
## $params$encode.method
## [1] "treatment"
##
## $params$encode.affect_columns
## selector_type("factor")
##
## $params$classif.log_reg.maxit
## [1] 19
##
## $params$classif.log_reg.epsilon
## [1] 1.1e-05
##
##
## $perf
## classif.ce classif.acc
## 0.1008042 0.8991958
# Score the tuned logistic regression on the held-out test rows.
tunedlog_prediction = learner_tunedlog$predict(task_combat, row_ids = test_set)
# naive Bayes: hyperparameters selected by the AutoTuner.
learner_tunednb$tuning_result
## $tune_x
## $tune_x$classif.naive_bayes.laplace
## [1] 1
##
## $tune_x$classif.naive_bayes.threshold
## [1] 0.36296
##
## $tune_x$classif.naive_bayes.eps
## [1] 2.3e-10
##
##
## $params
## $params$encode.method
## [1] "treatment"
##
## $params$encode.affect_columns
## selector_type("factor")
##
## $params$classif.naive_bayes.laplace
## [1] 1
##
## $params$classif.naive_bayes.threshold
## [1] 0.36296
##
## $params$classif.naive_bayes.eps
## [1] 2.3e-10
##
##
## $perf
## classif.ce classif.acc
## 0.1724583 0.8275417
# Score the tuned naive Bayes on the held-out test rows.
tunednb_prediction = learner_tunednb$predict(task_combat, row_ids = test_set)
# kknn: hyperparameters selected by the AutoTuner.
learner_tunedkknn$tuning_result
## $tune_x
## $tune_x$classif.kknn.k
## [1] 67
##
## $tune_x$classif.kknn.distance
## [1] 1
##
##
## $params
## $params$encode.method
## [1] "treatment"
##
## $params$encode.affect_columns
## selector_type("factor")
##
## $params$classif.kknn.k
## [1] 67
##
## $params$classif.kknn.distance
## [1] 1
##
##
## $perf
## classif.ce classif.acc
## 0.06618383 0.93381617
# Score the tuned kknn on the held-out test rows.
tunedkknn_prediction = learner_tunedkknn$predict(task_combat, row_ids = test_set)
# ranger (random forest): hyperparameters selected by the AutoTuner.
learner_tunedRF$tuning_result
## $tune_x
## $tune_x$classif.ranger.num.trees
## [1] 203
##
## $tune_x$classif.ranger.min.node.size
## [1] 1
##
## $tune_x$classif.ranger.mtry
## [1] 1
##
##
## $params
## $params$encode.method
## [1] "treatment"
##
## $params$encode.affect_columns
## selector_type("factor")
##
## $params$classif.ranger.num.trees
## [1] 203
##
## $params$classif.ranger.min.node.size
## [1] 1
##
## $params$classif.ranger.mtry
## [1] 1
##
##
## $perf
## classif.ce classif.acc
## 0.05449829 0.94550171
# Score the tuned random forest on the held-out test rows.
# Bug fix: the original called learner_tunedxgboost$predict here (copy-paste
# error), so tunedRF_prediction merely duplicated the xgboost predictions.
tunedRF_prediction = learner_tunedRF$predict(task_combat, row_ids = test_set)
# xgboost: hyperparameters selected by the AutoTuner.
learner_tunedxgboost$tuning_result
## $tune_x
## $tune_x$classif.xgboost.eta
## [1] 0.3222222
##
## $tune_x$classif.xgboost.gamma
## [1] 0
##
## $tune_x$classif.xgboost.max_depth
## [1] 19
##
## $tune_x$classif.xgboost.min_child_weight
## [1] 0.01
##
## $tune_x$classif.xgboost.nrounds
## [1] 30
##
##
## $params
## $params$encode.method
## [1] "treatment"
##
## $params$encode.affect_columns
## selector_type("factor")
##
## $params$classif.xgboost.nrounds
## [1] 30
##
## $params$classif.xgboost.verbose
## [1] 0
##
## $params$classif.xgboost.eta
## [1] 0.3222222
##
## $params$classif.xgboost.gamma
## [1] 0
##
## $params$classif.xgboost.max_depth
## [1] 19
##
## $params$classif.xgboost.min_child_weight
## [1] 0.01
##
##
## $perf
## classif.ce classif.acc
## 0.05683553 0.94316447
# Score the tuned xgboost on the held-out test rows.
tunedxgboost_prediction = learner_tunedxgboost$predict(task_combat, row_ids = test_set)
# Compare untuned vs. tuned test-set performance per learner.
# NOTE(review): the untuned prediction objects (glmnet_prediction,
# log_prediction, nb_prediction, kknn_prediction, RF_prediction,
# xgboost_prediction) are created in a chunk not shown here -- verify they
# exist before running this section.
# tuned glmnet
glmnet_prediction$score(measures)
## classif.ce classif.acc
## 0.90652286 0.09347714
tunedglmnet_prediction$score(measures)
## classif.ce classif.acc
## 0.91319979 0.08680021
# tuned log_reg
log_prediction$score(measures)
## classif.ce classif.acc
## 0.09912686 0.90087314
tunedlog_prediction$score(measures)
## classif.ce classif.acc
## 0.09912686 0.90087314
# tuned naive bayes
nb_prediction$score(measures)
## classif.ce classif.acc
## 0.1633282 0.8366718
tunednb_prediction$score(measures)
## classif.ce classif.acc
## 0.1631228 0.8368772
# tuned KKNN
kknn_prediction$score(measures)
## classif.ce classif.acc
## 0.07693888 0.92306112
tunedkknn_prediction$score(measures)
## classif.ce classif.acc
## 0.06789933 0.93210067
# tuned ranger (RF)
RF_prediction$score(measures)
## classif.ce classif.acc
## 0.05649718 0.94350282
# NOTE(review): in the original run tunedRF_prediction was generated by the
# xgboost learner (copy-paste slip), which is why the printed values below are
# identical to the tuned-xgboost scores; re-run after correcting that line.
tunedRF_prediction$score(measures)
## classif.ce classif.acc
## 0.05536723 0.94463277
# tuned xgboost
xgboost_prediction$score(measures)
## classif.ce classif.acc
## 0.05721623 0.94278377
tunedxgboost_prediction$score(measures)
## classif.ce classif.acc
## 0.05536723 0.94463277
# For benchmarking, define fresh learners fixed at the tuned hyperparameters.
# NOTE(review): several values below differ from the tuning results printed
# above (glmnet alpha/s/eps, log_reg maxit/epsilon, naive_bayes settings,
# kknn k = 45 vs. tuned 67, ranger min.node.size = 2 vs. 1, xgboost
# eta/max_depth) -- presumably taken from a different tuning run; verify.
learner_glm_bench<- lrn("classif.glmnet",alpha=0.85,s=0.06,eps=0.0000000045, predict_type = "prob")
learner_log_bench<-lrn("classif.log_reg",maxit=20,epsilon=0.00000001, predict_type = "prob")
learner_naive_bench<- lrn("classif.naive_bayes",laplace=0.5,threshold=0.5,eps=0.0000000001, predict_type = "prob")
learner_kknn_bench<- lrn("classif.kknn",k=45, distance=1, predict_type = "prob")
learner_ranger_bench <-lrn("classif.ranger",num.trees = 203, min.node.size = 2, mtry=1, predict_type = "prob")
learner_xg_bench <-lrn("classif.xgboost",eta = 0.4777778, gamma = 0, max_depth = 15,
min_child_weight =0.01, nrounds = 30 ,predict_type = "prob")
# Benchmark all six learners on the task with 3-fold CV.
learners<-list(learner_glm_bench,learner_log_bench,
learner_naive_bench,learner_kknn_bench
,learner_ranger_bench,learner_xg_bench)
design<-benchmark_grid(tasks = task_combat,learners = learners,resamplings =rsmp("cv", folds = 3))
bmr <-benchmark(design)
## INFO [12:32:56.572] Benchmark with 18 resampling iterations
## INFO [12:32:56.578] Applying learner 'classif.glmnet' on task 'task_combat' (iter 2/3)
## INFO [12:32:56.866] Applying learner 'classif.xgboost' on task 'task_combat' (iter 1/3)
## INFO [12:32:58.543] Applying learner 'classif.naive_bayes' on task 'task_combat' (iter 3/3)
## INFO [12:33:01.035] Applying learner 'classif.log_reg' on task 'task_combat' (iter 1/3)
## INFO [12:33:01.170] Applying learner 'classif.kknn' on task 'task_combat' (iter 1/3)
## INFO [12:33:37.869] Applying learner 'classif.naive_bayes' on task 'task_combat' (iter 2/3)
## INFO [12:33:40.359] Applying learner 'classif.glmnet' on task 'task_combat' (iter 3/3)
## INFO [12:33:40.645] Applying learner 'classif.ranger' on task 'task_combat' (iter 3/3)
## INFO [12:33:42.468] Applying learner 'classif.kknn' on task 'task_combat' (iter 2/3)
## INFO [12:34:19.371] Applying learner 'classif.kknn' on task 'task_combat' (iter 3/3)
## INFO [12:34:56.738] Applying learner 'classif.log_reg' on task 'task_combat' (iter 3/3)
## INFO [12:34:56.870] Applying learner 'classif.xgboost' on task 'task_combat' (iter 2/3)
## INFO [12:34:58.567] Applying learner 'classif.ranger' on task 'task_combat' (iter 2/3)
## INFO [12:35:00.349] Applying learner 'classif.ranger' on task 'task_combat' (iter 1/3)
## INFO [12:35:02.225] Applying learner 'classif.glmnet' on task 'task_combat' (iter 1/3)
## INFO [12:35:02.527] Applying learner 'classif.xgboost' on task 'task_combat' (iter 3/3)
## INFO [12:35:04.242] Applying learner 'classif.naive_bayes' on task 'task_combat' (iter 1/3)
## INFO [12:35:06.989] Applying learner 'classif.log_reg' on task 'task_combat' (iter 2/3)
## INFO [12:35:07.140] Finished benchmark
## nr resample_result task_id learner_id resampling_id iters classif.ce classif.acc
## 1: 1 <ResampleResult> task_combat classif.glmnet cv 3 0.91742686 0.08257314
## 2: 2 <ResampleResult> task_combat classif.log_reg cv 3 0.10200937 0.89799063
## 3: 3 <ResampleResult> task_combat classif.naive_bayes cv 3 0.17122781 0.82877219
## 4: 4 <ResampleResult> task_combat classif.kknn cv 3 0.06683514 0.93316486
## 5: 5 <ResampleResult> task_combat classif.ranger cv 3 0.05489809 0.94510191
## 6: 6 <ResampleResult> task_combat classif.xgboost cv 3 0.05777449 0.94222551
# Choose two Pokemon (by id from the pokemon table) and predict the winner
# with the best learner from the benchmark; the win probability for the first
# Pokemon is visualized as a gauge.
pokemonA <- "123"
pokemonB <- "321"
# Fit the benchmark winner (ranger) on the full task for interactive use.
learner_ranger_bench$train(task_combat)
# Holds the previous prediction so the gauge can show a delta against the last
# battle; must be initialised to NA once before the first who_wins() call.
remember_prob <- NA_real_
# Predicts the outcome of a battle between two Pokemon (given as ids matching
# pokemon$id) and visualizes the probability for Pokemon A (the first Pokemon)
# to win as a plotly gauge, with a delta against the previous call.
#
# Args:
#   pokemonA, pokemonB: Pokemon ids (character) looked up in `pokemon`.
# Returns: a plotly figure.
# Side effects: updates the globals `data_for_classification` (last feature
#   row, kept for inspection as in the original) and `remember_prob`.
who_wins <- function(pokemonA, pokemonB){
  # Vectorized stat lookup by id; match() replaces the original per-element
  # sapply() loops with identical results.
  stat_of <- function(ids, column) pokemon[[column]][match(ids, pokemon$id)]

  # Engineer the same difference features (first minus second) the model was
  # trained on, in the same column order as the training backend.
  data_for_classification <<- data.frame(
    speedVSspeed_diff   = stat_of(pokemonA, "speed")      - stat_of(pokemonB, "speed"),
    attackVSattack_diff = stat_of(pokemonA, "attack")     - stat_of(pokemonB, "attack"),
    sp_atkVSsp_atk_diff = stat_of(pokemonA, "sp_attack")  - stat_of(pokemonB, "sp_attack"),
    sp_defVSsp_def_diff = stat_of(pokemonA, "sp_defense") - stat_of(pokemonB, "sp_defense"),
    HPVSHP_diff         = stat_of(pokemonA, "hp")         - stat_of(pokemonB, "hp")
  )

  # Final model: the benchmark winner (ranger); swap in another learner here
  # if a different model should drive the gauge.
  prediction <- learner_ranger_bench$predict_newdata(newdata = data_for_classification)
  # Column 2 of the probability matrix corresponds to the second level of the
  # First_wins factor ("TRUE"), i.e. the first Pokemon winning.
  prob <- prediction$prob[, 2]

  fig <- plot_ly(
    type = "indicator",
    mode = "gauge+number+delta",
    value = prob,
    title = list(text = "Probability for First Pokemon to win", font = list(size = 30)),
    delta = list(reference = remember_prob,
                 increasing = list(color = "lightgreen"),
                 decreasing = list(color = "red")),
    gauge = list(
      axis = list(range = list(0, 1), tickwidth = 1, tickcolor = "black"),
      bar = list(color = "seagreen"),
      bgcolor = "palegray",
      borderwidth = 2,
      bordercolor = "gray",
      steps = list(
        list(range = c(0.0, 0.2), color = "white"),
        list(range = c(0.2, 0.4), color = "white"),
        list(range = c(0.4, 0.6), color = "white"),
        list(range = c(0.6, 0.8), color = "white"),
        list(range = c(0.8, 1), color = "white"))))
  fig <- fig %>%
    layout(
      margin = list(l = 5, r = 10),
      paper_bgcolor = "white",
      font = list(color = "gray", family = "Arial", size = 20))

  # Remember this probability so the next call can display a delta.
  remember_prob <<- as.numeric(prob[1])
  fig
}
# Example battles; arguments are ids from the pokemon table.
#Play1
who_wins(pokemonA = "284", pokemonB = "324")
#Play2
who_wins(pokemonA = "12", pokemonB = "45")
#Play3
who_wins(pokemonA = "1", pokemonB = "29")